Package org.terrier.structures.indexing.singlepass

Source Code of org.terrier.structures.indexing.singlepass.Posting

/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is Posting.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*   Roi Blanco (rblanc{at}@udc.es)
*   Craig Macdonald (craigm{at}dcs.gla.ac.uk)
*/

package org.terrier.structures.indexing.singlepass;

import java.io.IOException;

import org.terrier.compression.MemorySBOS;

/**
* Class representing a simple posting list in memory.
* It keeps the information for <code>TF, Nt</code>, and the sequence <code>[doc, tf]</code>
* @author Roi Blanco
*
*/
public class Posting {
 
  /** The term frequency */
  protected int TF;
  /** The document frequency */
  protected int Nt;
  /** The compressed in-memory object holding the sequence doc_id, idf*/ 
  protected MemorySBOS docIds;
  /** Last document inserted in the posting */
  protected int lastInt = 0;
   
  /**
   * Writes the first document in the posting list.
   * @param docId the document identifier.
   * @param freq the frequency of the term in the document.
   * @return the number of bytes consumed in the buffer
   * @throws IOException if an I/O error ocurrs.
   */ 
  public int writeFirstDoc(int docId, int freq) throws IOException{   
    docIds = new MemorySBOS();
    TF = freq;     
    Nt = 1;
    //System.err.println("Writing docid="+ (docId+1) + " f=" + freq);
    docIds.writeGamma(docId + 1);
    docIds.writeGamma(freq);
    lastInt = docId;
    return docIds.getSize();
  }
 
  /**
   * Inserts a new document in the posting list. Document insertions must be done
   * in order. 
   * @param doc the document identifier.
   * @param freq the frequency of the term in the document.
   * @return the number of bytes consumed in the buffer
   * @throws IOException if and I/O error occurs.
   */
  public int insert(int doc, int freq) throws IOException
  {   
    final int bytes = docIds.getSize();
    Nt++;
    TF += freq;
    docIds.writeGamma(doc - lastInt);
    docIds.writeGamma(freq);
    lastInt = doc;         
    return docIds.getSize() - bytes;
  }

  /**
   * @return the term frequency of the term in the run
   */
  public int getTF(){
    return TF;
  }
 
  /**
   * @return the document data compressed object.
   */
  public MemorySBOS getDocs(){
      return docIds;
 
 
  /**
   * Sets the term frequency in the run.
   * @param tf the term frequency.
   */
  public void setTF(int tf){
    this.TF = tf;
  }
 
  /**
   * Sets the document data compressed object.
   * @param docs
   */
  public void setDocs(MemorySBOS docs){
    this.docIds = docs;
  }

  /**
   * @return the document frequency - the number of documents this term occurs in
   */
  public int getDocF() {
    return Nt;
  }

  /**
   * Set the document frequency the number of documents this term occurs in.
   * @param docF the document frequency.
   */
  public void setDocF(int docF) {
    this.Nt = docF;
  }

  /** Returns the size of the underlying buffer representing this posting set. */
  public int getSize() {
    return docIds.getSize();
  }
}
TOP

Related Classes of org.terrier.structures.indexing.singlepass.Posting

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.